############# Script prepares Word Embeddings and Dictionaries##################
############# Mostly Replication from Rice & Zorn 2019 #########################
################################################################################

#####################Prepare Corpus for Embeddings##############################

library(dplyr)
library(tidyverse)
library(quanteda)
library(text2vec)
### Set Working Directory to Root Folder, applies to all scripts ###

load("Data/EP_prepped_110323.Rdata")

### Strip extraction residue from the speech texts
EP_challenger$text <- EP_challenger$text %>%
  # drop every parenthesised passage (lazy match, one bracket pair at a time)
  gsub(pattern = "\\(.*?\\)", replacement = "") %>%
  # drop the "European Parliament" header line
  gsub(pattern = "European Parliament\n", replacement = "") %>%
  # squeeze each run of whitespace into a single blank
  gsub(pattern = "\\s+", replacement = " ") %>%
  tolower()

################################################################################
##################### Train Local Embeddings (Rice and Zorn 2019) ################


#### When replicating you might want to load these objects as they are quite time-consuming to produce ################
#load(c("NLP_Model/word_vectors_28102021.Rdata"))
#load(c("NLP_Model/main_28102021.Rdata"))
#load(c("NLP_Model/w2v_28102021.Rdata"))
#load(c("NLP_Model/context_28102021.Rdata"))


# Corpus over all speeches; document names come from `doc_id` and are allowed
# to repeat (unique_docnames = FALSE).
new_corpus <- corpus(
  EP_challenger,
  docid_field = "doc_id",
  text_field = "text",
  unique_docnames = FALSE
)

# Document-feature matrix of the whole corpus: lower-cased, with punctuation,
# numbers, English stopwords and a few corpus-specific filler terms removed.
new_dfm <- dfm(
  new_corpus,
  tolower = TRUE,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove = c(
    stopwords("english"),
    "-", "+", "<", "u", ">",
    "also", "mr", "mrs", "president", "can", "one", "like"
  ),
  verbose = TRUE
)

# Quick inspection: the 100 most frequent features and the vocabulary size.
topfeatures(new_dfm, n = 100)
nfeat(new_dfm)

# Tokenise the corpus and drop tokens shorter than two characters.
tokscomplete <- tokens_select(
  tokens(new_corpus),
  selection = "remove",
  min_nchar = 2
)

# Vocabulary used for the embeddings: feature names that survive stopword /
# punctuation / number removal and frequency trimming.
# NOTE(review): min_docfreq = 0.2 is passed without `docfreq_type`; confirm
# whether a proportion threshold was intended.
featscomplete <- featnames(
  dfm_trim(
    dfm(
      tokscomplete,
      tolower = TRUE,
      remove_punct = TRUE,
      remove_numbers = TRUE,
      remove = c(
        stopwords("english"),
        "-", "+", "<", "u", ">",
        "also", "mr", "mrs", "president", "can", "one", "like"
      ),
      verbose = TRUE
    ),
    min_termfreq = 100,
    min_docfreq = 0.2
  )
)

# Keep only vocabulary tokens; padding = TRUE leaves empty slots in place of
# the removed tokens so that window distances are not shortened.
toks <- tokens_select(tokscomplete, featscomplete, padding = TRUE)

# Feature co-occurrence matrix over a 5-token window, weighted by 1 / distance.
complete_fcm <- fcm(
  toks,
  context = "window",
  count = "weighted",
  weights = 1 / (1:5),
  tri = TRUE
)

# Fit a rank-150 GloVe model on the co-occurrence matrix.
# `main` receives the value returned by fit_transform(); `context` is read
# from the fitted model's `components` field afterwards.
glove_complete <- GlobalVectors$new(rank = 150, x_max = 10)
main <- glove_complete$fit_transform(
  complete_fcm,
  n_iter = 100,
  convergence_tol = 0.001,
  n_threads = 8,
  learning_rate = 0.05,
  lambda = 1e-5
)
context <- glove_complete$components

# Sum of both embedding sets, stored as a dfm.
dfm_vectors <- as.dfm(t(main) + context)

# Final word embeddings: main vectors plus (transposed) context vectors.
word_vectors <- main + t(context)

# Data-frame copy of the main vectors with the term in a leading `word`
# column and automatic row names.
df_main <- as.data.frame(main)
df_main <- cbind(
  word = rownames(df_main),
  data.frame(df_main, row.names = NULL)
)

# Keep only the terms that are also features of the full-corpus dfm.
w2v <- df_main[is.element(df_main$word, featnames(new_dfm)), ]

#save(w2v, file ="w2v_28102021.Rdata")
#save(word_vectors, file ="word_vectors_28102021.Rdata")
#save(main, file ="main_28102021.Rdata")
#save(context, file ="context_28102021.Rdata")


# Anti-establishment seed vector: the element-wise sum of the embeddings of
# the seed terms below, kept as a 1-row matrix. Terms are added left to right
# in the listed order; a term listed twice is added twice. A term that is not
# a row of `word_vectors` raises a subscript error.
anti_establishment_seed <- Reduce(
  `+`,
  lapply(
    c(
      "absurd", "arrogant", "corrupt", "undemocratic", "shameful",
      "anti-grassroots", "grassroots", "imperialist", "unelected", "admit",
      "elite", "bureaucrats", "short-sighted", "unaccountable",
      "unacceptable", "cynical", "authoritarian", "anti-democratic",
      "disgraceful", "anti-grass", "grass-roots", "domination", "frankly",
      "elites", "technocrats", "cynical", "prematurely", "incomprehensible",
      "shameful", "dictatorial", "technocratic", "dreadful", "reactionary",
      "classes", "renationalisation", "unaccountable", "admitted", "circles",
      "lobbyists", "one-sided", "ridiculous", "unilateralism",
      "non-transparent", "appalling", "struggle", "illusory", "accountable",
      "disappointed", "club", "decision-makers", "mistaken", "utterly",
      "short-sighted", "incompetent", "flawed", "embarrassing", "obsession",
      "technocratic", "dictates", "foolish", "scandalous", "outrageous",
      "unacceptable", "unjust", "frustrating", "dishonest", "inefficient",
      "hypocritical", "complacent", "immoral", "ineffective",
      "inappropriate", "arrogance", "irresponsible"
    ),
    function(term) word_vectors[term, , drop = FALSE]
  )
)


# Immigration seed vector: element-wise sum of the embeddings of the seed
# terms below, kept as a 1-row matrix. Terms are added left to right in the
# listed order; "dublin" is listed twice and therefore added twice.
immigration_seed <- Reduce(
  `+`,
  lapply(
    c(
      "dublin", "migration", "illegal", "asylum", "immigrants", "migrants",
      "refugees", "seekers", "migratory", "integration", "deportation",
      "schengen", "influx", "displaced", "frontex", "repatriation",
      "foreigners", "dublin", "eurodac", "irregular", "arriving",
      "lampedusa", "fleeing", "mass", "asylum-seekers"
    ),
    function(term) word_vectors[term, , drop = FALSE]
  )
)



# Austerity seed vector: element-wise sum of the embeddings of the seed terms
# below, kept as a 1-row matrix. Terms are added left to right in the listed
# order.
austerity_seed <- Reduce(
  `+`,
  lapply(
    c(
      "austerity", "debts", "crisis", "greece", "eurozone", "unemployment",
      "euro", "bailout", "debt", "inflation", "ecb", "budgetary",
      "recession", "troika", "deficit", "deficits", "currency", "monetary",
      "fiscal", "macroeconomic", "macro-economic"
    ),
    function(term) word_vectors[term, , drop = FALSE]
  )
)


# European-integration seed vector: element-wise sum of the embeddings of the
# seed terms below, kept as a 1-row matrix. Terms are added left to right in
# the listed order; "citizens" is listed twice and therefore added twice.
european_integration_seed <- Reduce(
  `+`,
  lapply(
    c(
      "integration", "democratic", "powers", "parliament", "union", "eu",
      "institutions", "parliamentary", "constitution", "citizens", "rights",
      "parliaments", "citizens", "legitimacy", "fundamental", "community",
      "constitutional", "accountability", "united", "participation",
      "elected"
    ),
    function(term) word_vectors[term, , drop = FALSE]
  )
)



################################################################################
################################################################################
################################################################################


# Cosine similarity of every word vector to the anti-establishment seed.
cos_sim <- sim2(
  x = word_vectors,
  y = anti_establishment_seed,
  method = "cosine",
  norm = "l2"
)
head(sort(cos_sim[, 1], decreasing = TRUE), 50)

# The 250 terms closest to the seed: similarities as weights, names as words.
popWeights <- head(sort(cos_sim[, 1], decreasing = TRUE), 250)
popWords <- names(popWeights)
popWords

### find most similar documents (Rice and Zorn Code)

# Token iterator over the cleaned speeches, keyed by document id.
myTokens <- itoken(
  EP_challenger$text,
  ids = EP_challenger$doc_id,
  preprocess_function = tolower,
  tokenizer = word_tokenizer,
  progressbar = TRUE
)

# Vocabulary, pruned once here ...
vocab <- myTokens %>%
  create_vocabulary() %>%
  prune_vocabulary(
    term_count_min = 100,
    doc_proportion_max = 0.9,
    doc_count_min = 100
  )
v <- vocab
it <- myTokens

# ... and pruned a second time before vectorising.
pruned_vocab <- prune_vocabulary(
  v,
  term_count_min = 90,
  doc_proportion_max = 0.9,
  doc_proportion_min = 0.001
)
vectorizer <- vocab_vectorizer(pruned_vocab)

# Document-term matrix; row sums are stored before `dtm` is overwritten
# with its TF-IDF transform.
dtm <- create_dtm(it, vectorizer)
wordCounts <- rowSums(dtm)
tfidf <- TfIdf$new()
dtm <- fit_transform(dtm, tfidf)


# Score every document on the anti-establishment dictionary:
#   score_i = sum_j dtm[i, j] * weight_j   over the dictionary terms j.
#
# Restrict the DTM to dictionary terms (drop = FALSE keeps a matrix even if
# only one term matches) and put the columns in alphabetical order.
popDtm <- dtm[, colnames(dtm) %in% names(popWeights), drop = FALSE]
popDtm <- popDtm[, order(colnames(popDtm)), drop = FALSE]

# Pick the weights by column name so weight j always belongs to column j.
tmpPopWeights <- popWeights[colnames(popDtm)]

# Use a matrix-vector product. `popDtm * tmpPopWeights` would recycle the
# weight vector column-major, i.e. down the document rows, so each cell would
# be scaled by a weight chosen by document position instead of by its term.
popCounts <- as.matrix(popDtm %*% as.numeric(tmpPopWeights))[, 1]
names(popCounts) <- rownames(popDtm)
pop_names <- names(popCounts)
pop_counts <- unname(popCounts)

################# create DF of most similar documents

df <- cbind.data.frame(pop_names, pop_counts)
colnames(df) <- c("doc_id", "embed_dict")
df <- df[order(df$doc_id), ]

#############################FIND MOST SIMILAR AUSTERITY################################################

# Cosine similarity of every word vector to the austerity seed.
cos_sim_aut <- sim2(
  x = word_vectors,
  y = austerity_seed,
  method = "cosine",
  norm = "l2"
)
head(sort(cos_sim_aut[, 1], decreasing = TRUE), 50)

# The 250 terms closest to the seed: similarities as weights, names as words.
autWeights <- head(sort(cos_sim_aut[, 1], decreasing = TRUE), 250)
autWords <- names(autWeights)
autWords

# Score every document: score_i = sum_j dtm[i, j] * weight_j.
# Restrict the DTM to dictionary terms (drop = FALSE keeps a matrix even if
# only one term matches) and put the columns in alphabetical order.
autDtm <- dtm[, colnames(dtm) %in% names(autWeights), drop = FALSE]
autDtm <- autDtm[, order(colnames(autDtm)), drop = FALSE]

# Pick the weights by column name so weight j always belongs to column j.
tmpautWeights <- autWeights[colnames(autDtm)]

# Use a matrix-vector product. `autDtm * tmpautWeights` would recycle the
# weight vector column-major, i.e. down the document rows, so each cell would
# be scaled by a weight chosen by document position instead of by its term.
autCounts <- as.matrix(autDtm %*% as.numeric(tmpautWeights))[, 1]
names(autCounts) <- rownames(autDtm)
aut_names <- names(autCounts)
aut_counts <- unname(autCounts)

df_aut <- cbind.data.frame(aut_names, aut_counts)
colnames(df_aut) <- c("doc_id", "aut_dict")
df_aut <- df_aut[order(df_aut$doc_id), ]

#############################FIND MOST SIMILAR Immigration ################################################

# Cosine similarity of every word vector to the immigration seed.
cos_sim_immi <- sim2(
  x = word_vectors,
  y = immigration_seed,
  method = "cosine",
  norm = "l2"
)
head(sort(cos_sim_immi[, 1], decreasing = TRUE), 150)

# The 250 terms closest to the seed: similarities as weights, names as words.
immiWeights <- head(sort(cos_sim_immi[, 1], decreasing = TRUE), 250)
immiWords <- names(immiWeights)
immiWords

# Score every document: score_i = sum_j dtm[i, j] * weight_j.
# Restrict the DTM to dictionary terms (drop = FALSE keeps a matrix even if
# only one term matches) and put the columns in alphabetical order.
immiDtm <- dtm[, colnames(dtm) %in% names(immiWeights), drop = FALSE]
immiDtm <- immiDtm[, order(colnames(immiDtm)), drop = FALSE]

# Pick the weights by column name so weight j always belongs to column j.
tmpimmiWeights <- immiWeights[colnames(immiDtm)]

# Use a matrix-vector product. `immiDtm * tmpimmiWeights` would recycle the
# weight vector column-major, i.e. down the document rows, so each cell would
# be scaled by a weight chosen by document position instead of by its term.
immiCounts <- as.matrix(immiDtm %*% as.numeric(tmpimmiWeights))[, 1]
names(immiCounts) <- rownames(immiDtm)
immi_names <- names(immiCounts)
immi_counts <- unname(immiCounts)

df_immi <- cbind.data.frame(immi_names, immi_counts)
colnames(df_immi) <- c("doc_id", "immi_dict")
df_immi <- df_immi[order(df_immi$doc_id), ]

#############################FIND MOST SIMILAR INTEGRATION################################################

# Cosine similarity of every word vector to the European-integration seed.
cos_sim_int <- sim2(
  x = word_vectors,
  y = european_integration_seed,
  method = "cosine",
  norm = "l2"
)
head(sort(cos_sim_int[, 1], decreasing = TRUE), 250)

# The 250 terms closest to the seed: similarities as weights, names as words.
intWeights <- head(sort(cos_sim_int[, 1], decreasing = TRUE), 250)
intWords <- names(intWeights)
intWords

# Score every document: score_i = sum_j dtm[i, j] * weight_j.
# Restrict the DTM to dictionary terms (drop = FALSE keeps a matrix even if
# only one term matches) and put the columns in alphabetical order.
intDtm <- dtm[, colnames(dtm) %in% names(intWeights), drop = FALSE]
intDtm <- intDtm[, order(colnames(intDtm)), drop = FALSE]

# Pick the weights by column name so weight j always belongs to column j.
tmpintWeights <- intWeights[colnames(intDtm)]

# Use a matrix-vector product. `intDtm * tmpintWeights` would recycle the
# weight vector column-major, i.e. down the document rows, so each cell would
# be scaled by a weight chosen by document position instead of by its term.
intCounts <- as.matrix(intDtm %*% as.numeric(tmpintWeights))[, 1]
names(intCounts) <- rownames(intDtm)
int_names <- names(intCounts)
int_counts <- unname(intCounts)

df_int <- cbind.data.frame(int_names, int_counts)
colnames(df_int) <- c("doc_id", "int_dict")
df_int <- df_int[order(df_int$doc_id), ]

########################################################################################################


# Combine the four dictionary scores into one table. The columns are bound by
# position, so first make sure all score tables list the same documents in
# the same order.
stopifnot(
  identical(as.character(df$doc_id), as.character(df_aut$doc_id)),
  identical(as.character(df$doc_id), as.character(df_immi$doc_id)),
  identical(as.character(df$doc_id), as.character(df_int$doc_id))
)
df_dicts <- cbind.data.frame(df$doc_id, df$embed_dict, df_aut$aut_dict, df_immi$immi_dict, df_int$int_dict)
colnames(df_dicts) <- c("doc_id", "embed_dict", "aut_dict", "immi_dict", "int_dict")
df_dicts <- df_dicts[order(df_dicts$doc_id), ]

# The score tables are sorted by doc_id as text. Sort the speeches the same
# way (as character), otherwise a numeric doc_id column would be ordered
# 1, 2, 10 while the scores are ordered "1", "10", "2" and the positional
# cbind below would attach scores to the wrong speeches.
EP_challenger <- EP_challenger[order(as.character(EP_challenger$doc_id)), ]

# Fail loudly instead of silently binding misaligned rows.
stopifnot(
  nrow(EP_challenger) == nrow(df_dicts),
  identical(as.character(EP_challenger$doc_id), as.character(df_dicts$doc_id))
)

# The result keeps both doc_id columns (speech metadata and score table), as
# before.
EP_complete_dict <- cbind.data.frame(EP_challenger, df_dicts)
save(EP_complete_dict, file = "Data/EP_complete_dict_110323.Rdata")

#### CHECKED AND WORKING ON 12.03.2023 ####

